# import libraries
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import plotly.graph_objects as go
np.random.seed(1)
tf.random.set_seed(1)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
df=pd.read_csv('training_100.csv')
df['timestamp']=pd.to_datetime(df['timestamp'])
df.head()
df.tail()
| timestamp | kpi_value | request_count | anomaly_label | |
|---|---|---|---|---|
| 20154 | 2020-08-28 01:56:00+02:00 | 0.998149 | 26467 | 0 |
| 20155 | 2020-08-28 01:57:00+02:00 | 0.998340 | 26502 | 0 |
| 20156 | 2020-08-28 01:58:00+02:00 | 0.998364 | 26887 | 0 |
| 20157 | 2020-08-28 01:59:00+02:00 | 0.998428 | 26712 | 0 |
| 20158 | 2020-08-28 02:00:00+02:00 | 0.997407 | 29694 | 0 |
import plotly.express as px
fig = px.line(df, x="timestamp", y=['kpi_value', 'anomaly_label'], title='KPI Value and Anomaly Label', template='plotly_dark')
fig.show()
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['timestamp'].min(), df['timestamp'].max()
(Timestamp('2020-08-14 02:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-28 02:00:00+0200', tz='pytz.FixedOffset(120)'))
time=df['timestamp']
time_point=time[round(len(df)*0.8)]
time_point
Timestamp('2020-08-25 06:47:00+0200', tz='pytz.FixedOffset(120)')
train, test = df.loc[df['timestamp'] <= time_point], df.loc[df['timestamp'] > time_point]
train.shape, test.shape
((16128, 4), (4031, 4))
scaler = StandardScaler()
scaler = scaler.fit(train[['kpi_value']])
#train['kpi_value'] = scaler.transform(train[['kpi_value']])
#test['kpi_value'] = scaler.transform(test[['kpi_value']])
TIME_STEPS=30
def create_sequences(X, y, time_steps=TIME_STEPS):
Xs, ys = [], []
for i in range(len(X)-time_steps): #Should reach max-time_step since anything after that would not predict correctly. Final iteration train from [len(X)-time_steps] until [len(X)-1], then use len(X) to predict the last value
Xs.append(X.iloc[i:(i+time_steps)].values)
ys.append(y.iloc[i+time_steps])
return np.array(Xs), np.array(ys)
X_train, y_train = create_sequences(train[['kpi_value']], train['kpi_value'])
X_test, y_test = create_sequences(test[['kpi_value']], test['kpi_value'])
print(f'Training shape: {X_train.shape}')
print(f'Testing shape: {X_test.shape}')
Training shape: (16098, 30, 1) Testing shape: (4001, 30, 1)
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(50, 1)))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam')
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= lstm (LSTM) (None, 50, 50) 10400 _________________________________________________________________ lstm_1 (LSTM) (None, 64) 29440 _________________________________________________________________ dense (Dense) (None, 1) 65 ================================================================= Total params: 39,905 Trainable params: 39,905 Non-trainable params: 0 _________________________________________________________________
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1,
callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min')], shuffle=False)
Epoch 1/100 WARNING:tensorflow:Model was constructed with shape (None, 50, 1) for input KerasTensor(type_spec=TensorSpec(shape=(None, 50, 1), dtype=tf.float32, name='lstm_input'), name='lstm_input', description="created by layer 'lstm_input'"), but it was called on an input with incompatible shape (None, 30, 1). WARNING:tensorflow:Model was constructed with shape (None, 50, 1) for input KerasTensor(type_spec=TensorSpec(shape=(None, 50, 1), dtype=tf.float32, name='lstm_input'), name='lstm_input', description="created by layer 'lstm_input'"), but it was called on an input with incompatible shape (None, 30, 1). 453/453 [==============================] - ETA: 0s - loss: 0.0451WARNING:tensorflow:Model was constructed with shape (None, 50, 1) for input KerasTensor(type_spec=TensorSpec(shape=(None, 50, 1), dtype=tf.float32, name='lstm_input'), name='lstm_input', description="created by layer 'lstm_input'"), but it was called on an input with incompatible shape (None, 30, 1). 453/453 [==============================] - 54s 77ms/step - loss: 0.0450 - val_loss: 1.0043e-05 Epoch 2/100 453/453 [==============================] - 31s 69ms/step - loss: 4.7072e-06 - val_loss: 1.0183e-05 Epoch 3/100 453/453 [==============================] - 32s 71ms/step - loss: 4.9759e-06 - val_loss: 8.4723e-06 Epoch 4/100 453/453 [==============================] - 32s 70ms/step - loss: 5.4492e-06 - val_loss: 4.0257e-06 Epoch 5/100 453/453 [==============================] - 28s 61ms/step - loss: 5.8730e-06 - val_loss: 4.0391e-06 Epoch 6/100 453/453 [==============================] - 30s 66ms/step - loss: 6.0924e-06 - val_loss: 9.7875e-06 Epoch 7/100 453/453 [==============================] - 27s 59ms/step - loss: 5.6263e-06 - val_loss: 2.5442e-05
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend();
model.evaluate(X_test, y_test)
126/126 [==============================] - 2s 16ms/step - loss: 2.6967e-05
2.6967309167957865e-05
train_prediction = model.predict(X_train, verbose=0)
print(f"Train Prediction Shape: {train_prediction.shape}")
print(f"y Train Shape: {y_train.shape}")
train_mae_loss = np.mean(np.abs(train_prediction - y_train), axis=1)
plt.hist(train_mae_loss, bins=50)
plt.xlabel('Train MAE loss')
plt.ylabel('Number of Samples');
threshold = np.max(train_mae_loss)
print(f'Reconstruction error threshold: {threshold}')
WARNING:tensorflow:Model was constructed with shape (None, 50, 1) for input KerasTensor(type_spec=TensorSpec(shape=(None, 50, 1), dtype=tf.float32, name='lstm_input'), name='lstm_input', description="created by layer 'lstm_input'"), but it was called on an input with incompatible shape (None, 30, 1). Train Prediction Shape: (16098, 1) y Train Shape: (16098,) Reconstruction error threshold: 0.008372797857322641
test_prediction = model.predict(X_test, verbose=0)
print(f"Test Prediction Shape: {test_prediction.shape}")
print(f"y Test Shape: {y_test.shape}")
test_mae_loss = np.mean(np.abs(test_prediction - y_test), axis=1)
plt.hist(test_mae_loss, bins=50)
plt.xlabel('Test MAE loss')
plt.ylabel('Number of Samples');
threshold = 0.6*np.max(test_mae_loss)
print(f'Reconstruction error threshold: {threshold}')
Test Prediction Shape: (4001, 1) y Test Shape: (4001,) Reconstruction error threshold: 0.005197885387023382
test_score_df = pd.DataFrame(test[TIME_STEPS:])
test_score_df['loss'] = test_mae_loss
test_score_df['threshold'] = threshold
test_score_df['anomaly'] = test_score_df['loss'] > test_score_df['threshold']
test_score_df['kpi_value'] = test[TIME_STEPS:]['kpi_value']
y_pred=model.predict(X_test)
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_score_df['timestamp'], y=test_score_df['loss'], name='Test loss'))
fig.add_trace(go.Scatter(x=test_score_df['timestamp'], y=test_score_df['threshold'], name='Threshold'))
fig.add_trace(go.Scatter(x=df['timestamp'][16404:20158], y=df['anomaly_label'], name='Labeled Anomalies'))
fig.update_layout(showlegend=True, title='Test loss vs. Threshold')
fig.show()
anomalies = test_score_df.loc[test_score_df['anomaly'] == True]
anomalies.shape
(1749, 7)
anomalies
| timestamp | kpi_value | request_count | anomaly_label | loss | threshold | anomaly | |
|---|---|---|---|---|---|---|---|
| 16158 | 2020-08-25 07:18:00+02:00 | 0.996031 | 14864 | 0 | 0.006891 | 0.005198 | True |
| 16159 | 2020-08-25 07:19:00+02:00 | 0.996929 | 14977 | 0 | 0.006901 | 0.005198 | True |
| 16160 | 2020-08-25 07:20:00+02:00 | 0.996093 | 15359 | 0 | 0.006925 | 0.005198 | True |
| 16161 | 2020-08-25 07:21:00+02:00 | 0.996274 | 14761 | 0 | 0.006938 | 0.005198 | True |
| 16162 | 2020-08-25 07:22:00+02:00 | 0.994142 | 14169 | 0 | 0.006947 | 0.005198 | True |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 20154 | 2020-08-28 01:56:00+02:00 | 0.998149 | 26467 | 0 | 0.008358 | 0.005198 | True |
| 20155 | 2020-08-28 01:57:00+02:00 | 0.998340 | 26502 | 0 | 0.008341 | 0.005198 | True |
| 20156 | 2020-08-28 01:58:00+02:00 | 0.998364 | 26887 | 0 | 0.008325 | 0.005198 | True |
| 20157 | 2020-08-28 01:59:00+02:00 | 0.998428 | 26712 | 0 | 0.008310 | 0.005198 | True |
| 20158 | 2020-08-28 02:00:00+02:00 | 0.997407 | 29694 | 0 | 0.008299 | 0.005198 | True |
1749 rows × 7 columns
anomalies["anomaly"]
16158 True
16159 True
16160 True
16161 True
16162 True
...
20154 True
20155 True
20156 True
20157 True
20158 True
Name: anomaly, Length: 1749, dtype: bool
test_score_df['kpi_value']
16158 0.996031
16159 0.996929
16160 0.996093
16161 0.996274
16162 0.994142
...
20154 0.998149
20155 0.998340
20156 0.998364
20157 0.998428
20158 0.997407
Name: kpi_value, Length: 4001, dtype: float64
test_score_df['kpi_value'].values.reshape(-1,1)
array([[0.99603068],
[0.99692862],
[0.9960935 ],
...,
[0.99836352],
[0.99842767],
[0.99740688]])
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_score_df['timestamp'], y=scaler.inverse_transform(test_score_df['kpi_value'].values.reshape(1,-1)), name='Close price'))
fig.add_trace(go.Scatter(x=anomalies['timestamp'], y=scaler.inverse_transform(anomalies['kpi_value'].values.reshape(1,-1)), mode='markers', name='Anomaly'))
fig.update_layout(showlegend=True, title='Detected anomalies')
fig.show()
loss=test_score_df['loss']
loss
threshold=test_score_df['threshold']
threshold.iloc[8]
0.005197885387023382
n=0
a=0
new_anomalies=[]
for i in range(len(test_score_df['loss'])):
if test_score_df['loss'].iloc[i] <= test_score_df['threshold'].iloc[i]:
n=n+1
t=0
new_anomalies.append(t)
elif test_score_df['loss'].iloc[i] > test_score_df['threshold'].iloc[i]:
a=a+1
t=1
new_anomalies.append(t)
print(f"Number of no anomalies: {n}")
print(f"Number of anomalies: {a}")
Number of no anomalies: 2252 Number of anomalies: 1749
label=test_score_df['anomaly_label']
len(label)-len(new_anomalies)
plt.plot(label)
plt.plot(new_anomalies)
[<matplotlib.lines.Line2D at 0x1de957b1400>]
df['kpi_value'].plot(figsize = (24, 5), legend = True)
df.plot(legend = True)
<AxesSubplot:>
label=df['anomaly_label']
print(len(label))
import math
20159
from sklearn.metrics import f1_score
f1_score(test['kpi_value'],y_pred, average='macro')